In [1]:
%matplotlib inline
In [28]:
#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:80% !important; }</style>"))
In [30]:
import urllib
import os
import numpy as np
import pandas as pd
import geopandas as gpd
from geopandas import GeoDataFrame, read_file
from datetime import datetime, timedelta
from pyproj import CRS

import sys
sys.path.append("..")
import movingpandas as mpd

import warnings
warnings.simplefilter("ignore")

import hvplot.pandas # seems to be necessary for the following import to work
from holoviews import opts
opts.defaults(opts.Overlay(active_tools=['wheel_zoom']))
In [4]:
df = pd.read_csv("ThermochronTracking Elephants Kruger 2007.csv")
df['timestamps'] = pd.to_datetime(df['timestamp'])
df = df.set_index('timestamps').tz_localize(None)
print("This dataset contains {} records.\nThe first lines are:".format(len(df)))
df.head()
ndf = gpd.GeoDataFrame(
    df, geometry=gpd.points_from_xy(df['location-long'], df['location-lat']))
ndf.head()
This dataset contains 283688 records.
The first lines are:
Out[4]:
event-id visible timestamp location-long location-lat external-temperature sensor-type individual-taxon-canonical-name tag-local-identifier individual-local-identifier study-name geometry
timestamps
2007-08-13 00:30:00 9421351127 True 2007-08-13 00:30:00.000 31.87091 -24.81373 24.0 gps Loxodonta africana AM105 AM105 ThermochronTracking Elephants Kruger 2007 POINT (31.87091 -24.81373)
2007-08-13 02:00:00 9421351128 True 2007-08-13 02:00:00.000 31.87399 -24.81483 23.0 gps Loxodonta africana AM105 AM105 ThermochronTracking Elephants Kruger 2007 POINT (31.87399 -24.81483)
2007-08-13 03:31:00 9421351129 True 2007-08-13 03:31:00.000 31.87724 -24.81673 21.0 gps Loxodonta africana AM105 AM105 ThermochronTracking Elephants Kruger 2007 POINT (31.87724 -24.81673)
2007-08-13 04:00:00 9421351130 True 2007-08-13 04:00:00.000 31.87822 -24.81569 21.0 gps Loxodonta africana AM105 AM105 ThermochronTracking Elephants Kruger 2007 POINT (31.87822 -24.81569)
2007-08-13 06:00:00 9421351131 True 2007-08-13 06:00:00.000 31.89554 -24.79870 22.0 gps Loxodonta africana AM105 AM105 ThermochronTracking Elephants Kruger 2007 POINT (31.89554 -24.79870)
In [36]:
ndf.columns
Out[36]:
Index(['event-id', 'visible', 'timestamp', 'location-long', 'location-lat',
       'external-temperature', 'sensor-type',
       'individual-taxon-canonical-name', 'tag-local-identifier',
       'individual-local-identifier', 'study-name', 'geometry'],
      dtype='object')
In [5]:
ndf.rename(columns={"location-long": "long", "location-lat": "lat"}, inplace=True)
ndf.head()
Out[5]:
event-id visible timestamp long lat external-temperature sensor-type individual-taxon-canonical-name tag-local-identifier individual-local-identifier study-name geometry
timestamps
2007-08-13 00:30:00 9421351127 True 2007-08-13 00:30:00.000 31.87091 -24.81373 24.0 gps Loxodonta africana AM105 AM105 ThermochronTracking Elephants Kruger 2007 POINT (31.87091 -24.81373)
2007-08-13 02:00:00 9421351128 True 2007-08-13 02:00:00.000 31.87399 -24.81483 23.0 gps Loxodonta africana AM105 AM105 ThermochronTracking Elephants Kruger 2007 POINT (31.87399 -24.81483)
2007-08-13 03:31:00 9421351129 True 2007-08-13 03:31:00.000 31.87724 -24.81673 21.0 gps Loxodonta africana AM105 AM105 ThermochronTracking Elephants Kruger 2007 POINT (31.87724 -24.81673)
2007-08-13 04:00:00 9421351130 True 2007-08-13 04:00:00.000 31.87822 -24.81569 21.0 gps Loxodonta africana AM105 AM105 ThermochronTracking Elephants Kruger 2007 POINT (31.87822 -24.81569)
2007-08-13 06:00:00 9421351131 True 2007-08-13 06:00:00.000 31.89554 -24.79870 22.0 gps Loxodonta africana AM105 AM105 ThermochronTracking Elephants Kruger 2007 POINT (31.89554 -24.79870)
In [6]:
tag  = df['tag-local-identifier'].unique()
print(tag)
individual = df['individual-local-identifier'].unique()
print(individual)
name = df['individual-taxon-canonical-name'].unique()
print(name)
['AM105' 'AM107' 'AM108' 'AM110' 'AM239' 'AM253' 'AM254' 'AM255' 'AM306'
 'AM307' 'AM308' 'AM91' 'AM93' 'AM99']
['AM105' 'AM107' 'AM108' 'AM110' 'AM239' 'AM253' 'AM254' 'AM255' 'AM306'
 'AM307' 'AM308' 'AM91' 'AM93' 'AM99']
['Loxodonta africana']
In [7]:
ndf.set_crs(epsg=3857, inplace=True)
original_crs = ndf.crs
original_crs
Out[7]:
<Projected CRS: EPSG:3857>
Name: WGS 84 / Pseudo-Mercator
Axis Info [cartesian]:
- X[east]: Easting (metre)
- Y[north]: Northing (metre)
Area of Use:
- name: World - 85°S to 85°N
- bounds: (-180.0, -85.06, 180.0, 85.06)
Coordinate Operation:
- name: Popular Visualisation Pseudo-Mercator
- method: Popular Visualisation Pseudo Mercator
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich
In [8]:
# Geographic extent: Is the geographical extent as expected and are there holes in the spatial coverage?
ndf.to_crs({'init': 'epsg:3857'}).hvplot(title='Geographic extent of the dataset', geo=True, tiles='OSM', width=500, height=500)
Out[8]:
In [48]:
print("The dataset covers the time between {} and {}.".format(ndf.index.min(), df.index.max()))
The dataset covers the time between 2007-08-13 00:30:00 and 2009-08-12 21:30:00.
In [49]:
print("That's {}".format(df.index.max() - df.index.min()))
That's 730 days 21:00:00
In [51]:
ndf['event-id'].resample('1d').count().hvplot(title='Number of records per day')
Out[51]:
In [53]:
# Does the geographic extent vary over time or do holes appear during certain times?
ndf['Y-M'] = ndf.index.to_period('M')
a = None
for i in ndf['Y-M'].unique():
    plot = ndf[ndf['Y-M']==i].hvplot(title=str(i), size=2, geo=True, tiles='OSM', width=300, height=300)
    if a: a = a + plot
    else: a = plot
a
Out[53]:
In [9]:
#  Is the data sampled at regular or irregular intervals?
t = ndf.reset_index().timestamps
ndf = ndf.assign(delta_t=t.diff().values)
ndf['delta_t'] = ndf['delta_t'].dt.total_seconds()/60
pd.DataFrame(ndf).hvplot.hist('delta_t', title='Histogram of intervals between consecutive records (in minutes)', bins=60, bin_range=(0, 400))
Out[9]:
In [62]:
# Most intervals of consecutive records are 30 minutes (255.144), but goes up to about 360 Minutes (49 counts, about 360 Minutes)
In [10]:
tc = mpd.TrajectoryCollection(ndf, 'tag-local-identifier')
traj = tc.trajectories[0]
traj.add_speed()
max_speed = traj.df.speed.max()
print("The highest computed speed is {:,.2f} m/s ({:,.2f} km/h)".format(max_speed, max_speed*3600/1000))
The highest computed speed is 0.00 m/s (0.00 km/h)
In [11]:
pd.DataFrame(traj.df).hvplot.hist('speed', title='Histogram of speeds (in meters per second)', bins=90)
Out[11]:
In [66]:
# Speed shows no surprising patterns.
In [76]:
# Movement directions
traj.add_direction(overwrite=True)
pd.DataFrame(traj.df).hvplot.hist('direction', title='Histogram of directions', bins=90)
Out[76]:
In [77]:
# There is some variation in movement directions but no directions stand out in the histogram.
In [80]:
# Does the movement make sense in its temporal context?
pd.DataFrame(traj.df).hvplot.heatmap(title='Mean speed by hour of day and month of year', 
                                     x='timestamps.hour', y='timestamps.month', C='speed', reduce_function=np.mean)
Out[80]:
In [81]:
# In the summer months (Dec - March) it can clearly be seen that the animnals move faster
In [82]:
traj.df['n'] = 1
pd.DataFrame(traj.df).hvplot.heatmap(title='Record count by temperature and month of year', 
                                     x='external-temperature', y='timestamps.month', C='n', reduce_function=np.sum)
Out[82]:
In [83]:
pd.DataFrame(traj.df).hvplot.heatmap(title='Mean speed by temperature and month of year', 
                                     x='external-temperature', y='timestamps.month', C='speed', reduce_function=np.mean)
Out[83]:
In [13]:
daily = mpd.TemporalSplitter(tc).split(mode='day')
daily_lengths = [traj.get_length() for traj in daily]
daily_t = [traj.get_start_time() for traj in daily]
daily_lengths = pd.DataFrame(daily_lengths, index=daily_t, columns=['length'])
daily_lengths.hvplot(title='Daily trajectory length')
Out[13]:
In [14]:
# You can see that the trajectory length are longer in the summer months
In [24]:
# convex hulls around trajectory (covered areas)
daily_areas = [(traj.id, traj.to_crs(CRS(3857)).to_linestring().convex_hull.area/10000) for traj in daily]
daily_areas = pd.DataFrame(daily_areas, index=daily_t, columns=['id', 'area'])
daily_areas.hvplot(title='Daily covered area [ha]', y='area')
Out[24]:
In [25]:
# Biggest areas in summer months
daily_areas.sort_values(by='area')[-10:]
Out[25]:
id area
2009-03-01 00:21:00 AM91_2009-03-01 00:00:00 4.859019e-07
2009-03-13 02:31:00 AM110_2009-03-13 00:00:00 4.964314e-07
2008-03-11 00:17:00 AM91_2008-03-11 00:00:00 5.063815e-07
2008-03-11 00:00:00 AM93_2008-03-11 00:00:00 5.089734e-07
2008-07-26 00:00:00 AM307_2008-07-26 00:00:00 5.515618e-07
2008-02-11 00:00:00 AM255_2008-02-11 00:00:00 5.782130e-07
2009-01-21 00:23:00 AM91_2009-01-21 00:00:00 6.026373e-07
2009-01-21 00:00:00 AM93_2009-01-21 00:00:00 6.117384e-07
2009-03-17 00:30:00 AM110_2009-03-17 00:00:00 7.563424e-07
2009-03-17 01:00:00 AM99_2009-03-17 00:00:00 7.676963e-07
In [26]:
daily_areas.sort_values(by='area')[:10] # smallest areas covered (rather in winter)
Out[26]:
id area
2009-05-17 00:00:00 AM308_2009-05-17 00:00:00 0.0
2009-02-02 23:00:00 AM108_2009-02-02 00:00:00 0.0
2009-01-08 14:30:00 AM306_2009-01-08 00:00:00 0.0
2008-10-05 12:30:00 AM307_2008-10-05 00:00:00 0.0
2009-07-25 10:30:00 AM308_2009-07-25 00:00:00 0.0
2008-01-31 01:00:00 AM105_2008-01-31 00:00:00 0.0
2009-07-20 00:00:00 AM105_2009-07-20 00:00:00 0.0
2008-11-13 23:00:00 AM107_2008-11-13 00:00:00 0.0
2009-06-29 13:30:00 AM308_2009-06-29 00:00:00 0.0
2008-12-02 13:00:00 AM306_2008-12-02 00:00:00 0.0
In [27]:
a = None
for i in daily_areas.sort_values(by='area')[-10:].id:
    traj = daily.get_trajectory(i)
    if a: a = a + traj.hvplot(title=i, c='speed', line_width=2, cmap='RdYlBu', width=300, height=300)
    else: a = traj.hvplot(title=i, c='speed', line_width=2, cmap='RdYlBu', width=300, height=300)
a    
Out[27]: